Each chunk represents a code window in the book. Please note that some chunks could not run properly or take a too much time to do it. The first line of these chunks is a comment, starting with the simbol # to clarify why is this.

5.1 Descriptive Statistics

library(mlbench)
data(PimaIndiansDiabetes2)
? PimaIndiansDiabetes2
head(PimaIndiansDiabetes2)
##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35      NA 33.6    0.627  50      pos
## 2        1      85       66      29      NA 26.6    0.351  31      neg
## 3        8     183       64      NA      NA 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74      NA      NA 25.6    0.201  30      neg
attach(PimaIndiansDiabetes2)
## The following object is masked from package:datasets:
## 
##     pressure

5.1.1 Position measures

mean(c(4, 5, 8, 2, 8, 4, 5, 9, 2, 7))
## [1] 5.4
mean(mass, na.rm=TRUE)
## [1] 32.45746
sapply(PimaIndiansDiabetes2[, -9], mean, na.rm=TRUE)
##    pregnant     glucose    pressure     triceps     insulin        mass 
##   3.8450521 121.6867628  72.4051842  29.1534196 155.5482234  32.4574637 
##    pedigree         age 
##   0.4718763  33.2408854
sales.per <- c(0.10, -0.05, 0.20, 0.05, -0.03)
sales <- 1 + sales.per
5000 * prod(sales)
## [1] 6385.995
5000 * (1.050151) ^ 5
## [1] 6385.998
5000 * (1.054) ^ 5
## [1] 6503.888
library(psych)
geometric.mean(sales) - 1
## [1] 0.05015091
23 / 10 + 23 / 30
## [1] 3.066667
46 / 20
## [1] 2.3
speeds = c(10, 30)
harmonic.mean(speeds, na.rm=TRUE)
## [1] 15
46 / 15
## [1] 3.066667
median(1 : 7)
## [1] 4
median(1 : 6)
## [1] 3.5
median(mass , na.rm=TRUE)
## [1] 32.3
mean(c(1 : 7, 30))
## [1] 7.25
median(c(1 : 7, 30))
## [1] 4.5
sapply(PimaIndiansDiabetes2[, -9], median, na.rm=TRUE)
## pregnant  glucose pressure  triceps  insulin     mass pedigree      age 
##   3.0000 117.0000  72.0000  29.0000 125.0000  32.3000   0.3725  29.0000
library(DescTools)
## 
## Attaching package: 'DescTools'
## The following objects are masked from 'package:psych':
## 
##     AUC, ICC, SD
Mode(mass)
## [1] NA
## attr(,"freq")
## [1] NA
sapply(PimaIndiansDiabetes2, Mode, na.rm=TRUE)
## $pregnant
## [1] 1
## attr(,"freq")
## [1] 135
## 
## $glucose
## [1]  99 100
## attr(,"freq")
## [1] 17
## 
## $pressure
## [1] 70
## attr(,"freq")
## [1] 57
## 
## $triceps
## [1] 32
## attr(,"freq")
## [1] 31
## 
## $insulin
## [1] 105
## attr(,"freq")
## [1] 11
## 
## $mass
## [1] 32
## attr(,"freq")
## [1] 13
## 
## $pedigree
## [1] 0.254 0.258
## attr(,"freq")
## [1] 6
## 
## $age
## [1] 22
## attr(,"freq")
## [1] 72
## 
## $diabetes
## [1] neg
## attr(,"freq")
## [1] 500
## Levels: neg pos
# plot histogram
hist(mass , col="seashell", border="black", xlab="Body mass index", main="Pima Indians Diabetes")
# add mean
abline(v=mean(mass, na.rm=TRUE), col="red", lwd=2)
# add median
abline(v=median(mass, na.rm=TRUE), col="blue", lwd=2)
abline(v=Mode(mass), col="seagreen", lwd=2) # add mode
legend(x="topright", c("Mean", "Median", "Mode"),
col=c("red", "blue", "seagreen"), lwd=c(2, 2)) # legend

quantile(mass, probs=0.65, na.rm=TRUE)
##   65% 
## 34.54
quantile(mass, na.rm=TRUE)
##   0%  25%  50%  75% 100% 
## 18.2 27.5 32.3 36.6 67.1
quantile(mass, probs=seq(0, 1, 0.10), na.rm=TRUE)
##    0%   10%   20%   30%   40%   50%   60%   70%   80%   90%  100% 
## 18.20 24.00 26.20 28.40 30.34 32.30 33.80 35.50 37.88 41.62 67.10
percent.fun <- ecdf(mass)
percent.fun(40)
## [1] 0.8731836
percent.fun(median(mass , na.rm=TRUE))
## [1] 0.5019815
summary(mass)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   18.20   27.50   32.30   32.46   36.60   67.10      11
summary(PimaIndiansDiabetes2)
##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   : 44.0   Min.   : 24.00   Min.   : 7.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 64.00   1st Qu.:22.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :29.00  
##  Mean   : 3.845   Mean   :121.7   Mean   : 72.41   Mean   :29.15  
##  3rd Qu.: 6.000   3rd Qu.:141.0   3rd Qu.: 80.00   3rd Qu.:36.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##                   NA's   :5       NA's   :35       NA's   :227    
##     insulin            mass          pedigree           age        diabetes 
##  Min.   : 14.00   Min.   :18.20   Min.   :0.0780   Min.   :21.00   neg:500  
##  1st Qu.: 76.25   1st Qu.:27.50   1st Qu.:0.2437   1st Qu.:24.00   pos:268  
##  Median :125.00   Median :32.30   Median :0.3725   Median :29.00            
##  Mean   :155.55   Mean   :32.46   Mean   :0.4719   Mean   :33.24            
##  3rd Qu.:190.00   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00            
##  Max.   :846.00   Max.   :67.10   Max.   :2.4200   Max.   :81.00            
##  NA's   :374      NA's   :11

5.1.2 Dispersion measures

IQR(mass, na.rm=TRUE)
## [1] 9.1
sapply(PimaIndiansDiabetes2[, -9], IQR, na.rm=TRUE)
## pregnant  glucose pressure  triceps  insulin     mass pedigree      age 
##   5.0000  42.0000  16.0000  14.0000 113.7500   9.1000   0.3825  17.0000
boxplot(mass, xlab="Body mass index", col="papayawhip")

boxplot(pressure, xlab="Systolic blood pressure", col="pink")

boxplot(mass)$out

## [1] 53.2 55.0 67.1 52.3 52.3 52.9 59.4 57.3
boxplot(pressure)$out

##  [1]  30 110 108 122  30 110 108 110  24  38 106 106 106 114
boxplot(mass, range=3, xlab="Body mass index", col="papayawhip")

boxplot(pressure, range=3, xlab="Systolic blood pressure", col="pink")

boxplot(mass , range=3)$out

## [1] 67.1
boxplot(pressure , range=3)$out

## numeric(0)
var.p <- function(x){
  var(x, na.rm=TRUE) * (length(x) - 1) / (length(x))
}
sd.p <- function(x){
  sqrt(var.p(x))
}
class1 <- c(5, 5, 5, 5, 5, 5, 5, 5, 5, 5)
class2 <- c(0, 0, 0, 0, 0, 10, 10, 10, 10, 10)
var.p(class1)
## [1] 0
var.p(class2)
## [1] 25
sd.p(class1)
## [1] 0
sd.p(class2)
## [1] 5
var.p(mass)
## [1] 47.89302
sd.p(mass)
## [1] 6.920478
sapply(PimaIndiansDiabetes2[, -9], sd.p)
##    pregnant     glucose    pressure     triceps     insulin        mass 
##   3.3673836  30.5157546  12.3740943  10.4701592 118.6985020   6.9204784 
##    pedigree         age 
##   0.3311128  11.7525726
sd.p(mass) / mean(mass , na.rm=TRUE)
## [1] 0.2132169
sd.p(pressure) / mean(pressure , na.rm=TRUE)
## [1] 0.1709007

5.1.3 Shape measures

library(moments)
skewness(pressure, na.rm=TRUE)
## [1] 0.133878
skewness(pregnant, na.rm=TRUE)
## [1] 0.8999119
x <- c(rep(1, 100), rep(2, 140), rep(3, 25), rep(4, 15), rep(5, 10))
y <- c(rep(1, 100), rep(2, 140), rep(3, 30), rep(4, 20), rep(5, 15))
skewness(x)
## [1] 1.340999
skewness(y)
## [1] 1.214411
x <- c(rep(1, 30), rep(2, 40), rep(3, 250), rep(4, 400), rep(5, 150), rep(6, 120), rep(7, 10))
skewness(x)
## [1] 0
kurtosis(triceps, na.rm=TRUE)
## [1] 5.897361
sapply(PimaIndiansDiabetes2[,-9], FUN=kurtosis, na.rm=TRUE)
## pregnant  glucose pressure  triceps  insulin     mass pedigree      age 
## 3.150383 2.716919 3.896780 5.897361 9.274775 3.849771 8.550792 3.631177

5.1.4 Concentration measures

salary1 <- c(10, 15, 20, 21, 22, 35, 35, 38, 60, 100)
salary2 <- c(21, 22, 23, 25, 32, 38, 42, 46, 46, 50)
Gini(salary1)
## [1] 0.3963795
Gini(salary2)
## [1] 0.1935588
Gini(pregnant, na.rm=TRUE)
## [1] 0.4808656
Gini(pressure, na.rm=TRUE)
## [1] 0.09480261
plot(Lc(pregnant), col="red", lwd=2, main="Lorentz curve for pregnant", xlab="Cumulative individuals", ylab="Cumulative values")

plot(Lc(pressure, na.rm=TRUE), col="blue", lwd=2, main="Lorentz curve for pressure", xlab="Cumulative individuals", ylab="Cumulative values")

library(ineq)
## Registered S3 methods overwritten by 'ineq':
##   method   from     
##   plot.Lc  DescTools
##   lines.Lc DescTools
## 
## Attaching package: 'ineq'
## The following objects are masked from 'package:DescTools':
## 
##     Atkinson, Gini, Herfindahl, Lc, Rosenbluth
Theil(pregnant)
## [1] 0.2476443
Theil(pressure)
## [1] 0.01478803

5.1.5 Exercises

5.2 Statistical Inference

5.2.1 Random variables

class(mass)
## [1] "numeric"
class(pregnant)
## [1] "numeric"
sort(unique(pregnant))
##  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 17
loaded <- data.frame(values=c(1, 2, 3, 4, 5, 6), probs=c(1 / 10, 1 / 10, 3 / 10, 2 / 10, 2 / 10, 1 / 10))
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
ggplot(data=loaded , aes(x=values, y=probs)) +
  geom_bar(stat="identity", fill="aquamarine3") +
  ylim(c(0, 0.35))

ggplot(data=PimaIndiansDiabetes2, aes(x=mass)) +
  geom_density() +
  xlab("Mass") +
  ylab("Density")
## Warning: Removed 11 rows containing non-finite values (stat_density).

ggplot(data=PimaIndiansDiabetes2, aes(x=mass)) +
  stat_ecdf() +
  xlab("Mass") +
  ylab("Cumulative density")
## Warning: Removed 11 rows containing non-finite values (stat_ecdf).

pdf.density <- density(mass , na.rm=TRUE)
pdf.density$x[150]
## [1] 30.42878
pdf.density$y[150]
## [1] 0.05387463
ecdf.cumul <- ecdf(mass)
ecdf.cumul(35)
## [1] 0.677675
set.seed(5)
x <- 1 : 50
sample(x, size=10)
##  [1]  2 43 15 11 41 21 30  7 19  3
set.seed(5)
sample(c("H", "T"), 25, replace=TRUE)
##  [1] "T" "H" "H" "H" "H" "H" "H" "H" "T" "H" "H" "H" "H" "T" "T" "H" "H" "T" "T"
## [20] "T" "H" "T" "H" "H" "T"
set.seed(5)
sample(1 : 6, size=10, prob=c(0.1, 0.1, 0.1, 0.1, 0.1, 0.5), replace=TRUE)
##  [1] 6 4 1 6 6 5 3 2 1 6

5.2.2 Probability distributions

? Distributions
dbinom(2, size=5, prob=0.2)
## [1] 0.2048
pbinom(2, 5, 0.2, lower.tail=TRUE)
## [1] 0.94208
pbinom(2, 5, 0.2, lower.tail=FALSE)
## [1] 0.05792
qbinom(0.95, 5, 0.2)
## [1] 3
set.seed(5)
rbinom(10, 5, 0.2)
##  [1] 0 1 2 0 0 1 1 2 3 0
values <- 0 : 5
plot(values, dbinom(values , 5, 0.2), type="b", xlab="Values", ylab="Probabilities", main="Mass")

plot(values, pbinom(values , 5, 0.2), type="b", xlab="Values", ylab="Probabilities", main="Cumulative")

values <- 0 : 11
plot(values, dpois(values, 5), type="b", xlab = "Values", ylab="Probabilities", main="Mass")

plot(ppois(values, 5), type="b", xlab = "Values", ylab="Probabilities", main="Cumulative")

pnorm(80, 70, 15) - pnorm(60, 70, 15)
## [1] 0.4950149
x <- seq(40, 100, by=0.5)
curve(dnorm(x, 70, 15), xlim=c(40, 100), col="blue", lwd=2, xlab="x", ylab="f(x)", main="Density function N(70,15)")

curve(pnorm(x, 70, 15), xlim=c(40, 100), col="blue", lwd=2, xlab="x", ylab="f(x)", main="Cumulative function N(70,15)")

set.seed(1)
sample <- rnorm(1000, 70, 15)
hist(sample, freq=FALSE, breaks=seq(20, 130, 10), col="seashell", xlab="Heart rate", main="Histogram for simulation of N(70,15)")
curve(dnorm(x, 70, 15), xlim=c(20, 130), col="blue", lwd=2, add=TRUE)

qqnorm(mass, main="Normal Q-Q plot for mass")
qqline(mass, col="blue", lwd=2) # adds a reference line

qqnorm(rnorm(1000, 70, 15), main="Normal Q-Q plot for N(70,15)")
qqline(rnorm(1000, 70, 15), col="blue", lwd=2)

m = 10000 # sample size of the final variable
lambda = 4 # poisson parameter
par(mfrow=c(2, 2))
for (n in c(1, 5, 10, 100)) {
  x <- NULL #initialization of x
  for (i in 1:m) {
    x <- c(x, mean(rpois(n, lambda)))
  }
  x.normalized <- (x-lambda)/(sqrt(lambda/n))
  hist(x.normalized , col="seashell",
  main=paste("Mean of", n,"Po(4)"), xlab="Value")
}

n = 100
m = 10000
mu = mean(mass , na.rm=TRUE)
sigma = sd(mass , na.rm=TRUE)
x <- NULL
for (i in 1:m) {
  x <- c(x, mean(sample(na.omit(mass), n)))
}
y = (x-mu)/(sigma/sqrt(n))
hist(y, col="seashell", main=paste(paste("Mean of 100",sep=" "),"mass",sep=" "), xlab="Value")

5.2.3 Confidence intervals and hypothesis tests

mean(triceps, na.rm=TRUE)
## [1] 29.15342
sd(triceps, na.rm=TRUE) ^ 2
## [1] 109.7672
conf.level <- 0.95
t <- qt((1 - conf.level) / 2, df=length(triceps) - 1, lower.tail = FALSE)
sigma <- sd(triceps , na.rm=TRUE)
mean <- mean(triceps , na.rm=TRUE)
sd.mean <- sigma / sqrt(length(triceps) - sum(is.na(triceps)))
interval <- c(mean - t * sd.mean, mean + t * sd.mean)
interval
## [1] 28.26918 30.03766
MeanCI(triceps, na.rm=TRUE, conf.level=0.95)
##     mean   lwr.ci   upr.ci 
## 29.15342 28.26859 30.03825
library(boot)
## 
## Attaching package: 'boot'
## The following object is masked from 'package:psych':
## 
##     logit
set.seed(1)
resampling <- boot(na.omit(triceps), function(x, i) mean(x[i]), R=10000)
boot.ci(resampling, conf = 0.95)
## Warning in boot.ci(resampling, conf = 0.95): bootstrap variances needed for
## studentized intervals
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 10000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = resampling, conf = 0.95)
## 
## Intervals : 
## Level      Normal              Basic         
## 95%   (28.28, 30.04 )   (28.26, 30.02 )  
## 
## Level     Percentile            BCa          
## 95%   (28.29, 30.05 )   (28.32, 30.08 )  
## Calculations and Intervals on Original Scale
VarCI(mass, na.rm=TRUE, conf.level=0.95)
##      var   lwr.ci   upr.ci 
## 47.95546 43.46581 53.18227
t.test(pressure, mu=70, alternative="greater")
## 
##  One Sample t-test
## 
## data:  pressure
## t = 5.259, df = 732, p-value = 9.516e-08
## alternative hypothesis: true mean is greater than 70
## 95 percent confidence interval:
##  71.65196      Inf
## sample estimates:
## mean of x 
##  72.40518
t.test(pressure, mu=72, alternative="two.sided")
## 
##  One Sample t-test
## 
## data:  pressure
## t = 0.88595, df = 732, p-value = 0.3759
## alternative hypothesis: true mean is not equal to 72
## 95 percent confidence interval:
##  71.50732 73.30305
## sample estimates:
## mean of x 
##  72.40518
t.test(pressure, mu=71.4, alternative="two.sided")
## 
##  One Sample t-test
## 
## data:  pressure
## t = 2.1979, df = 732, p-value = 0.02827
## alternative hypothesis: true mean is not equal to 71.4
## 95 percent confidence interval:
##  71.50732 73.30305
## sample estimates:
## mean of x 
##  72.40518
test <- t.test(pressure, mu=70, alternative="greater")
test$p.value
## [1] 9.515989e-08
t.test(insulin, glucose, mu=0, alternative="two.sided")
## 
##  Welch Two Sample t-test
## 
## data:  insulin and glucose
## t = 5.5647, df = 420.03, p-value = 4.688e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  21.90042 45.82250
## sample estimates:
## mean of x mean of y 
##  155.5482  121.6868
t.test(insulin, glucose, mu=0, alternative="less")
## 
##  Welch Two Sample t-test
## 
## data:  insulin and glucose
## t = 5.5647, df = 420.03, p-value = 1
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##      -Inf 43.89268
## sample estimates:
## mean of x mean of y 
##  155.5482  121.6868
t.test(insulin, glucose, mu=30, alternative="two.sided")
## 
##  Welch Two Sample t-test
## 
## data:  insulin and glucose
## t = 0.63458, df = 420.03, p-value = 0.5261
## alternative hypothesis: true difference in means is not equal to 30
## 95 percent confidence interval:
##  21.90042 45.82250
## sample estimates:
## mean of x mean of y 
##  155.5482  121.6868
VarTest(pressure, sigma.squared=153)
## 
##  One Sample Chi-Square test on variance
## 
## data:  pressure
## X-squared = 733.52, df = 732, p-value = 0.9544
## alternative hypothesis: true variance is not equal to 153
## 95 percent confidence interval:
##  138.7477 170.3223
## sample estimates:
## variance of x 
##      153.3178
var.test(pressure, mass, ratio=3, alternative= "two.sided")
## 
##  F test to compare two variances
## 
## data:  pressure and mass
## F = 1.0657, num df = 732, denom df = 756, p-value = 0.3855
## alternative hypothesis: true ratio of variances is not equal to 3
## 95 percent confidence interval:
##  2.768995 3.691989
## sample estimates:
## ratio of variances 
##           3.197088
library(snpar)
runs.test(c(1, 1, 1, 1, 1, 1, 1, 1, 0, 0))
## 
##  Approximate runs rest
## 
## data:  c(1, 1, 1, 1, 1, 1, 1, 1, 0, 0)
## Runs = 2, p-value = 0.01287
## alternative hypothesis: two.sided
set.seed(5)
sample.vector <- rnorm(100000, 0, 1)
binary.vector <- ifelse(sample.vector < 0, 0, 1)
runs.test(binary.vector)
## 
##  Approximate runs rest
## 
## data:  binary.vector
## Runs = 50172, p-value = 0.268
## alternative hypothesis: two.sided
ks.test(glucose, pressure)
## Warning in ks.test(glucose, pressure): p-value will be approximate in the
## presence of ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  glucose and pressure
## D = 0.80399, p-value < 2.2e-16
## alternative hypothesis: two-sided
set.seed(5)
ks.test(rpois(100, 5), rpois(200, 5))
## Warning in ks.test(rpois(100, 5), rpois(200, 5)): p-value will be approximate in
## the presence of ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  rpois(100, 5) and rpois(200, 5)
## D = 0.07, p-value = 0.8996
## alternative hypothesis: two-sided
shapiro.test(insulin)
## 
##  Shapiro-Wilk normality test
## 
## data:  insulin
## W = 0.8041, p-value < 2.2e-16
set.seed(5)
shapiro.test(rnorm(5000, 0, 1))
## 
##  Shapiro-Wilk normality test
## 
## data:  rnorm(5000, 0, 1)
## W = 0.99961, p-value = 0.4446

5.2.4 Exercises

5.3 Multivariate Statistics

5.3.1 Correlation and bivariate statistics

ggplot(PimaIndiansDiabetes2, aes(x=mass, y=pressure)) +
  geom_point()
## Warning: Removed 39 rows containing missing values (geom_point).

ggplot(PimaIndiansDiabetes2, aes(x=mass, y=triceps)) +
  geom_point()
## Warning: Removed 229 rows containing missing values (geom_point).

ggplot(PimaIndiansDiabetes2, aes(x=mass, y=triceps)) +
  geom_point() +
  geom_vline(xintercept=mean(mass, na.rm=TRUE), colour="red", size=1.2) +
  geom_hline(yintercept=mean(triceps, na.rm=TRUE), colour="blue", size=1.2) +
  annotate("text", label="mean of mass", angle=90, x=34, y=75, size=4, colour="red") +
  annotate("text", label="mean of triceps", x=55, y=25, size=4, colour="blue")
## Warning: Removed 229 rows containing missing values (geom_point).

cov(mass, pressure, use="pairwise.complete.obs")
## [1] 24.64499
cov(mass, triceps, use="pairwise.complete.obs")
## [1] 46.72566
cor(mass, pressure, use="pairwise.complete.obs")
## [1] 0.2892303
cor(mass, triceps, use="pairwise.complete.obs")
## [1] 0.6482139
cor(PimaIndiansDiabetes2[, -9], use="pairwise.complete.obs")
##             pregnant   glucose     pressure   triceps    insulin       mass
## pregnant  1.00000000 0.1281346  0.214178483 0.1002391 0.08217103 0.02171892
## glucose   0.12813455 1.0000000  0.223191778 0.2280432 0.58118621 0.23277051
## pressure  0.21417848 0.2231918  1.000000000 0.2268391 0.09827230 0.28923034
## triceps   0.10023907 0.2280432  0.226839067 1.0000000 0.18488842 0.64821394
## insulin   0.08217103 0.5811862  0.098272299 0.1848884 1.00000000 0.22805016
## mass      0.02171892 0.2327705  0.289230340 0.6482139 0.22805016 1.00000000
## pedigree -0.03352267 0.1372457 -0.002804527 0.1150164 0.13039507 0.15538175
## age       0.54434123 0.2671356  0.330107425 0.1668158 0.22026068 0.02584146
##              pedigree        age
## pregnant -0.033522673 0.54434123
## glucose   0.137245741 0.26713555
## pressure -0.002804527 0.33010743
## triceps   0.115016426 0.16681577
## insulin   0.130395072 0.22026068
## mass      0.155381746 0.02584146
## pedigree  1.000000000 0.03356131
## age       0.033561312 1.00000000
library(corrplot)
## corrplot 0.84 loaded
corrplot(cor(PimaIndiansDiabetes2[,-9], use="pairwise.complete.obs"))

plot(PimaIndiansDiabetes2)

5.3.2 Linear Model

model <- lm(triceps ~ mass, data=PimaIndiansDiabetes2)
model$coefficients
## (Intercept)        mass 
##  -3.3734852   0.9894821
summary(model)
## 
## Call:
## lm(formula = triceps ~ mass, data = PimaIndiansDiabetes2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.764  -5.068  -0.612   5.021  68.038 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.37349    1.68557  -2.001   0.0459 *  
## mass         0.98948    0.05016  19.727   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.995 on 537 degrees of freedom
##   (229 observations deleted due to missingness)
## Multiple R-squared:  0.4202, Adjusted R-squared:  0.4191 
## F-statistic: 389.2 on 1 and 537 DF,  p-value: < 2.2e-16
summary(model)$r.squared
## [1] 0.4201813
ggplot(PimaIndiansDiabetes2, aes(x=mass, y=triceps)) +
  geom_point() +
  geom_smooth(method="lm", se=TRUE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 229 rows containing non-finite values (stat_smooth).
## Warning: Removed 229 rows containing missing values (geom_point).

predict(model, data.frame(mass=30), interval="confidence")
##        fit     lwr      upr
## 1 26.31098 25.5768 27.04515
library(psych)
pairs.panels(PimaIndiansDiabetes2[, -9],
method="pearson", hist.col="seashell",
density=TRUE , lm=TRUE)

5.3.3 Multivariable Linear Models

model <- lm(triceps ~ mass + glucose + pressure)
summary(model)
## 
## Call:
## lm(formula = triceps ~ mass + glucose + pressure)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -20.128  -4.878  -0.649   4.540  66.358 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.20853    2.43688  -2.548   0.0111 *  
## mass         0.95545    0.05410  17.662   <2e-16 ***
## glucose      0.02310    0.01171   1.972   0.0491 *  
## pressure     0.01637    0.03003   0.545   0.5860    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.008 on 528 degrees of freedom
##   (236 observations deleted due to missingness)
## Multiple R-squared:  0.4242, Adjusted R-squared:  0.4209 
## F-statistic: 129.7 on 3 and 528 DF,  p-value: < 2.2e-16
model <- lm(mass ~ ., data=PimaIndiansDiabetes2)
summary(model)
## 
## Call:
## lm(formula = mass ~ ., data = PimaIndiansDiabetes2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.9216  -3.3893  -0.7394   3.0061  21.5470 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 15.112359   1.850813   8.165 4.69e-15 ***
## pregnant    -0.228028   0.108476  -2.102   0.0362 *  
## glucose     -0.004484   0.011429  -0.392   0.6950    
## pressure     0.103319   0.021873   4.724 3.26e-06 ***
## triceps      0.398051   0.025744  15.462  < 2e-16 ***
## insulin      0.005744   0.002644   2.172   0.0304 *  
## pedigree     0.817712   0.760767   1.075   0.2831    
## age         -0.047715   0.036370  -1.312   0.1903    
## diabetespos  1.576717   0.659531   2.391   0.0173 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.01 on 383 degrees of freedom
##   (376 observations deleted due to missingness)
## Multiple R-squared:  0.5023, Adjusted R-squared:  0.4919 
## F-statistic: 48.31 on 8 and 383 DF,  p-value: < 2.2e-16
contrasts(diabetes)
##     pos
## neg   0
## pos   1
model <- lm(mass ~ . -glucose - pedigree - age, data=PimaIndiansDiabetes2)
summary(model)
## 
## Call:
## lm(formula = mass ~ . - glucose - pedigree - age, data = PimaIndiansDiabetes2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.1586  -3.3481  -0.6686   2.9879  21.5916 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 14.498506   1.526983   9.495  < 2e-16 ***
## pregnant    -0.325957   0.082872  -3.933 9.93e-05 ***
## pressure     0.095618   0.021376   4.473 1.02e-05 ***
## triceps      0.400059   0.025548  15.659  < 2e-16 ***
## insulin      0.004908   0.002251   2.180   0.0298 *  
## diabetespos  1.451142   0.595921   2.435   0.0153 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.01 on 386 degrees of freedom
##   (376 observations deleted due to missingness)
## Multiple R-squared:  0.4983, Adjusted R-squared:  0.4918 
## F-statistic: 76.69 on 5 and 386 DF,  p-value: < 2.2e-16

5.3.4 Nonlinear transformations

linear.price <- lm(price ~ x, data=diamonds)
summary(linear.price)$adj.r.squared
## [1] 0.7822215
ggplot(diamonds, aes(y=price , x=x)) +
  geom_point(alpha=.5) +
  geom_smooth(method="lm") +
  xlim(3, 11) +
  ylim(0, 19000)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 8 rows containing non-finite values (stat_smooth).
## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing missing values (geom_smooth).

poly.price <- lm(price ~ poly(x, 2), data=diamonds)
summary(poly.price)$adj.r.squared
## [1] 0.8591606
ggplot(diamonds , aes(y=price , x=x)) +
  geom_point(alpha=.5) +
  geom_smooth(method="lm", formula=y ~ poly(x, 2)) +
  xlim(3, 11) +
  ylim(0, 19000)
## Warning: Removed 8 rows containing non-finite values (stat_smooth).
## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 22 rows containing missing values (geom_smooth).

linear.carat <- lm(x ~ carat , data=diamonds)
summary(linear.carat)$adj.r.squared
## [1] 0.9508078
ggplot(diamonds, aes(y=x, x=carat)) +
  geom_point(alpha=.5) +
  geom_smooth(method="lm")
## `geom_smooth()` using formula 'y ~ x'

linear.carat <- lm(x ~ log(carat), data=diamonds)
summary(linear.carat)$adj.r.squared
## [1] 0.9804279
ggplot(diamonds, aes(y=x, x=carat)) +
  geom_point(alpha=.5) +
  geom_smooth(method="lm", formula=y ~ log(x))

contingency <- table(diabetes, age)
contingency
##         age
## diabetes 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
##      neg 58 61 31 38 34 25 24 25 16 15 11  7  7 10  5  6 13  6  9  7  9 11  2
##      pos  5 11  7  8 14  8  8 10 13  6 13  9 10  4  5 10  6 10  3  6 13  7 11
##         age
## diabetes 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
##      neg  3  7  6  2  4  2  3  3  1  1  2  3  1  4  4  1  3  1  2  4  1  3  2
##      pos  5  8  7  4  1  3  5  5  7  4  4  1  2  1  3  2  2  1  2  0  0  0  2
##         age
## diabetes 67 68 69 70 72 81
##      neg  2  1  2  0  1  1
##      pos  1  0  0  1  0  0
library(vcd)
## Loading required package: grid
assocstats(contingency)
##                     X^2 df   P(> X^2)
## Likelihood Ratio 150.06 51 1.0889e-11
## Pearson          140.94 51 2.3070e-10
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.394 
## Cramer's V        : 0.428
boxplot(mass ~ diabetes, data=PimaIndiansDiabetes2, ylab="Mass")

aov <- aov(mass ~ diabetes)
summary(aov)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## diabetes      1   3567    3567    82.4 <2e-16 ***
## Residuals   755  32687      43                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 11 observations deleted due to missingness
log.model <- glm(diabetes ~ ., data=PimaIndiansDiabetes2, family=binomial)
summary(log.model)
## 
## Call:
## glm(formula = diabetes ~ ., family = binomial, data = PimaIndiansDiabetes2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7823  -0.6603  -0.3642   0.6409   2.5612  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.004e+01  1.218e+00  -8.246  < 2e-16 ***
## pregnant     8.216e-02  5.543e-02   1.482  0.13825    
## glucose      3.827e-02  5.768e-03   6.635 3.24e-11 ***
## pressure    -1.420e-03  1.183e-02  -0.120  0.90446    
## triceps      1.122e-02  1.708e-02   0.657  0.51128    
## insulin     -8.253e-04  1.306e-03  -0.632  0.52757    
## mass         7.054e-02  2.734e-02   2.580  0.00989 ** 
## pedigree     1.141e+00  4.274e-01   2.669  0.00760 ** 
## age          3.395e-02  1.838e-02   1.847  0.06474 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 498.10  on 391  degrees of freedom
## Residual deviance: 344.02  on 383  degrees of freedom
##   (376 observations deleted due to missingness)
## AIC: 362.02
## 
## Number of Fisher Scoring iterations: 5
predictions <- predict(log.model , type="response")
head(predictions)
##          4          5          7          9         14         15 
## 0.02711452 0.89756363 0.03763559 0.85210016 0.79074609 0.71308818
fact.predictions <- rep("neg", length(diabetes))
fact.predictions[predictions > 0.5] <- "pos"
table(fact.predictions , diabetes)
##                 diabetes
## fact.predictions neg pos
##              neg 378 189
##              pos 122  79
mean(fact.predictions == diabetes, na.rm=TRUE)
## [1] 0.5950521
fact.predictions <- rep("neg", length(diabetes))
fact.predictions[predictions > 0.6]="pos"
table(fact.predictions, diabetes)
##                 diabetes
## fact.predictions neg pos
##              neg 400 198
##              pos 100  70
mean(fact.predictions == diabetes, na.rm=TRUE)
## [1] 0.6119792
log.model <- glm(diabetes ~ glucose, data=PimaIndiansDiabetes2, family=binomial)
new.data <- data.frame(glucose=138, mass=35.5, pedigree=0.325)
predict(log.model, newdata=new.data, type="response")
##        1 
## 0.473114

5.3.6 Exercises

5.4 Data analysis of the flights database

## 
## Attaching package: 'data.table'
## The following object is masked from 'package:DescTools':
## 
##     %like%
summary(flights)
##     icao24            callsign           country            longitude      
##  Length:3822909     Length:3822909     Length:3822909     Min.   :-163.42  
##  Class :character   Class :character   Class :character   1st Qu.: -89.98  
##  Mode  :character   Mode  :character   Mode  :character   Median : -12.82  
##                                                           Mean   : -22.12  
##                                                           3rd Qu.:  24.10  
##                                                           Max.   : 179.07  
##     latitude      altitude.baro        velocity          track        
##  Min.   :-57.13   Min.   : -960.1   Min.   :   0.0   Min.   :-172.63  
##  1st Qu.: 31.73   1st Qu.: 3794.8   1st Qu.: 151.2   1st Qu.:  92.07  
##  Median : 38.40   Median : 9448.8   Median : 206.9   Median : 191.10  
##  Mean   : 33.98   Mean   : 7668.4   Mean   : 189.8   Mean   : 183.47  
##  3rd Qu.: 44.94   3rd Qu.:10972.8   3rd Qu.: 236.3   3rd Qu.: 270.83  
##  Max.   : 88.50   Max.   :38648.6   Max.   :2384.5   Max.   : 359.90  
##  vertical.rate            time       
##  Min.   :-1541.140   Min.   :     0  
##  1st Qu.:   -0.650   1st Qu.:203400  
##  Median :    0.000   Median :395100  
##  Mean   :   -0.193   Mean   :391961  
##  3rd Qu.:    0.330   3rd Qu.:585890  
##  Max.   : 7764.780   Max.   :777600
library(corrplot)
corrplot(cor(flights[, -c(1, 2, 3)], use="pairwise.complete.obs"))

library(vcd)
assocstats(table(flights$callsign, flights$country))
##                        X^2       df P(> X^2)
## Likelihood Ratio  20977515 21686635        1
## Pearson          532547665 21686635        0
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.996 
## Cramer's V        : 0.98
model.velocity <- lm(velocity ~ altitude.baro, data=flights)
summary(model.velocity)
## 
## Call:
## lm(formula = velocity ~ altitude.baro, data = flights)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -565.89  -23.36   -0.06   24.00 2150.63 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.749e+01  3.967e-02    2205   <2e-16 ***
## altitude.baro 1.334e-02  4.564e-06    2922   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36.53 on 3822907 degrees of freedom
## Multiple R-squared:  0.6908, Adjusted R-squared:  0.6908 
## F-statistic: 8.54e+06 on 1 and 3822907 DF,  p-value: < 2.2e-16
model.velocity <- lm(velocity ~ longitude + latitude + altitude.baro, data=flights)
summary(model.velocity)
## 
## Call:
## lm(formula = velocity ~ longitude + latitude + altitude.baro, 
##     data = flights)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -567.15  -23.60   -0.88   23.77 2146.62 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.695e+01  4.805e-02 1809.75   <2e-16 ***
## longitude     5.542e-02  2.456e-04  225.60   <2e-16 ***
## latitude      6.625e-02  9.723e-04   68.14   <2e-16 ***
## altitude.baro 1.327e-02  4.573e-06 2902.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36.29 on 3822905 degrees of freedom
## Multiple R-squared:  0.6948, Adjusted R-squared:  0.6948 
## F-statistic: 2.901e+06 on 3 and 3822905 DF,  p-value: < 2.2e-16
model.velocity <- lm(velocity ~ altitude.baro + track, data=flights)
summary(model.velocity)
## 
## Call:
## lm(formula = velocity ~ altitude.baro + track, data = flights)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -584.87  -20.64    0.89   22.42 2146.45 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.082e+02  4.977e-02  2173.2   <2e-16 ***
## altitude.baro  1.329e-02  4.340e-06  3063.4   <2e-16 ***
## track         -1.110e-01  1.743e-04  -636.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.73 on 3822906 degrees of freedom
## Multiple R-squared:  0.7204, Adjusted R-squared:  0.7204 
## F-statistic: 4.926e+06 on 2 and 3822906 DF,  p-value: < 2.2e-16
model.vertical <- lm(vertical.rate ~ longitude + latitude + altitude.baro + velocity, data=flights)
summary(model.vertical)
## 
## Call:
## lm(formula = vertical.rate ~ longitude + latitude + altitude.baro + 
##     velocity, data = flights)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1541.0    -0.6     0.0     0.4  7764.9 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -1.134e+00  2.005e-02 -56.544  < 2e-16 ***
## longitude     -3.130e-04  7.573e-05  -4.133 3.59e-05 ***
## latitude      -1.871e-03  2.980e-04  -6.280 3.40e-10 ***
## altitude.baro -2.266e-06  2.507e-06  -0.904    0.366    
## velocity       5.347e-03  1.566e-04  34.134  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.11 on 3822904 degrees of freedom
## Multiple R-squared:  0.0009354,  Adjusted R-squared:  0.0009343 
## F-statistic: 894.8 on 4 and 3822904 DF,  p-value: < 2.2e-16